#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@创建时间    : 2022/10/24  11:09
@作者  : st
@文件名: pkuseg_utils.py
@项目名: PyCharm
@文件描述:
    
"""
import json
import re

import pkuseg

from utils.constent import pkuseg_model_path, pkuseg_user_path


class PkusegUtils:
    """Holds a pkuseg segmenter built from the project's configured paths."""

    def __init__(self):
        """Create the segmenter and announce on stdout that it was loaded.

        The model and user dictionary locations come from the imported
        ``pkuseg_model_path`` and ``pkuseg_user_path`` constants.
        """
        segmenter = pkuseg.pkuseg(
            model_name=pkuseg_model_path,
            user_dict=pkuseg_user_path,
        )
        self.seg = segmenter
        print('...loaded pkuseg dict')

    def split_pukseg(self, text):
        """Segment ``text`` with the wrapped segmenter.

        Args:
            text: The input passed straight through to ``self.seg.cut``.

        Returns:
            Whatever ``self.seg.cut(text)`` returns.
        """
        tokens = self.seg.cut(text)
        return tokens


def dict_process(src_path='../data/matadata/1020/all标准映射表.json',
                 dst_path='../data/matadata/1020/medical_word_dict.txt'):
    """Flatten the nested mapping JSON into a word list and write it to disk.

    The JSON at ``src_path`` is read as a two-level mapping
    (``{group: {key: value}}``). Every inner key and inner value is
    collected and de-duplicated. A term is dropped when it is shorter than
    two characters, consists only of digits, or is one or two lowercase
    ASCII letters. The remaining terms are written to ``dst_path`` one per
    line (no trailing newline), longest first.

    Args:
        src_path: Path of the UTF-8 JSON mapping file to read.
        dst_path: Path of the UTF-8 text file to write; any existing
            content is overwritten.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If ``src_path`` does not contain valid JSON.

    Note:
        Assumes every inner key and value is a string -- TODO confirm
        against the data file; other types would fail in ``len``/``re``.
    """
    with open(src_path, 'r', encoding='utf-8') as f:
        medical_dict = json.load(f)

    # Raw strings: '\d' inside a plain literal is an invalid escape sequence.
    pat_num = re.compile(r'^\d+$')
    pat_en = re.compile(r'^[a-z]{1,2}$')

    terms = set()
    for mapping in medical_dict.values():
        terms.update(mapping.keys())
        terms.update(mapping.values())

    medical_list = [
        term for term in terms
        if len(term) >= 2
        and not pat_num.match(term)
        and not pat_en.match(term)
    ]
    # Longest first. Ties are broken lexicographically so the output file is
    # reproducible: set iteration order for strings can differ between runs.
    medical_list.sort(key=lambda term: (-len(term), term))

    # Mode 'w' already truncates the file, so no explicit truncate() is needed.
    with open(dst_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(medical_list))


# Script entry point: regenerate the word dictionary file when run directly.
if "__main__" == __name__:
    dict_process()